library(soundgen)
library(magrittr); library(tidyverse)
tuneR::setWavPlayer('/usr/bin/afplay')
path_to_wav <- "data/01_raw_data/pilot-ads-ids-segments"
point_color = "darkorange"
model_fit_color = "dodgerblue"
First we define global parameters for pitch tracking.
lowest_pitch <- 75
highest_pitch <- 1000
sampling_rate <- 100 # same sampling rate as Rasanen et al., 2018
hz_lims <- c(0.1, 0.8)
Under the hood of soundgen analyze() is the short-time Fourier transform (STFT)
* look at one short segment of sound at a time (one STFT frame),
* analyze its spectrum using Fast Fourier Transform (FFT),
* and then move on to the next - perhaps overlapping - frame.
As the analysis window slides along the signal, STFT shows which frequencies it contains at different points of time.
First, we grab the audio segments that we want to analyze.
files_to_analyze <- list.files(here::here(path_to_wav))
ads_seg <- files_to_analyze[1]
ids_seg <- files_to_analyze[2]
We can listen to a file using the playme() function like this:
playme(here::here(path_to_wav, ads_seg))
Next, we use the analyze() function to generate features about the audio segment.
a_ads <- analyze(x = here::here(path_to_wav, ads_seg),
pitchFloor = lowest_pitch,
pitchCeiling = highest_pitch,
samplingRate = sampling_rate,
ylim = hz_lims,
priorMean = HzToSemitones(300), priorSD = 6)
What is median pitch?
a_ads$pitch %>% median(na.rm=T)
## [1] 238.8705
How does pitch of segment change over time?
qplot(a_ads$time, a_ads$pitch, geom = 'line') +
labs(x = "Time (ms)", y = "Pitch (Hz)")
playme(here::here(path_to_wav, ids_seg))
a_ids <- analyze(x = here::here(path_to_wav, ids_seg),
samplingRate = sampling_rate,
pitchFloor = lowest_pitch, pitchCeiling = highest_pitch,
ylim = hz_lims,
showLegend = FALSE)
# Store pitch tracking information in a tibble
d_pitch <- tibble(
segment_type = 'ids',
pitch = a_ids$pitch,
time_utt_ms = a_ids$time
)
First we have to interpolate between regions of the audio segment that did not have pitch measurements (i.e., unvoiced speech).
To do this we can use the cubicspline() function from the pracma package.
x <- d_pitch %>% filter(!is.na(pitch)) %>% pull(time_utt_ms)
y <- d_pitch %>% filter(!is.na(pitch)) %>% pull(pitch)
pp <- pracma::cubicspline(x, y)
ppfun <- function(x) {pracma::ppval(pp, x)}
We also log transform the pitch values because people’s perception of pitch change logarithmically.
d_pitch %<>%
mutate(pitch_int = ppfun(time_utt_ms),
log_pitch = log(pitch),
z_log_pitch = scale(log_pitch))
# check z score worked
d_pitch$z_log_pitch %>% mean(na.rm = T)
## [1] -7.496583e-16
d_pitch$z_log_pitch %>% sd(na.rm = T)
## [1] 1
Plot the interpolation function.
a <- d_pitch %>%
filter(time_utt_ms >= min(x),
time_utt_ms <= max(x)) %>%
ggplot() +
geom_line(aes(x = time_utt_ms, y = pitch_int),
color = model_fit_color, size = 1) +
#geom_point(aes(x = time_utt_ms, y = pitch_int), color = 'darkred') +
geom_point(aes(x = time_utt_ms, y = pitch), size = 3, color = point_color) +
lims(y = c(0, 700)) +
labs(x = "Time (ms)", y = "Pitch (Hz)", title = "Cubic Spline")
Hmm, this looks a lot wigglier than the interpolated points in the Rasanen paper. They also used 30-ms median filtering to the resulting pitch tracks to remove single outlier values.
# Use span to control the "wiggliness" of the default loess smoother.
# The span is the fraction of points used to fit each local regression:
# small numbers make a wigglier curve, larger numbers make a smoother curve.
frac_points <- 0.18
b <- d_pitch %>%
filter(time_utt_ms >= min(x),
time_utt_ms <= max(x)) %>%
ggplot() +
geom_smooth(aes(x = time_utt_ms, y = pitch),
span = frac_points, se = F,
color = "dodgerblue") +
geom_point(aes(x = time_utt_ms, y = pitch), size = 3,
color = 'darkorange') +
lims(y = c(0, 700)) +
labs(x = "Time (ms)", y = "Pitch (Hz)") +
labs(title ="Loess Fit")
Plot the two intrporolation functions next to each other
cowplot::plot_grid(a, b)
Next, we can make the same plot but using the log transformed and normalized pitch values.
d_pitch %>%
filter(time_utt_ms >= min(x),
time_utt_ms <= max(x)) %>%
ggplot(aes(x = time_utt_ms, y = z_log_pitch)) +
geom_hline(yintercept = 0, lty = "dashed") +
geom_smooth(span = frac_points, se = F, color = model_fit_color) +
geom_point(color = point_color, size = 3) +
labs(x = "Time (ms)", y = "Z-score normalized log(pitch)") +
lims(y = c(-2,2)) +
theme(panel.grid = element_blank())
# define the model
m_loess <- loess(z_log_pitch ~ time_utt_ms, span = frac_points,
data = d_pitch)
# predict fitted values for each observation in the original dataset
df_preds <- predict(m_loess,
newdata = d_pitch,
se = TRUE) %>%
data.frame()
Calculate sample entropy of the interpolated pitch contour. Sample entropy was introduced to quantify the the amount of regularity and the unpredictability of fluctuations in a time series. A low value of the entropy indicates that the time series is deterministic; a high value indicates uncertainty or more information content.
samp_ent_ids <- df_preds %>%
filter(!is.na(fit)) %>%
pull(fit) %>%
pracma::sample_entropy() %>%
round(2)
Plot the predicted values from the loess model.
loess_preds_ids <- df_preds %>%
mutate(time = d_pitch$time_utt_ms) %>%
ggplot(aes(x = time, y = fit)) +
geom_line(color = model_fit_color, size = 1) +
geom_point(color = point_color, size = 2) +
labs(x = "Time (ms)", y = "Z-score log(pitch)",
subtitle = paste('sample entropy = ', samp_ent_ids)) +
lims(y = c(-2,2))
Now do the same thing with the ADS segment.
# extract pitch contour
d_pitch_ads <- tibble(
segment_type = 'ads',
pitch = a_ads$pitch,
time_utt_ms = a_ads$time
)
# interpolate, log transform, and z-score
d_pitch_ads %<>%
mutate(pitch_int = ppfun(time_utt_ms),
log_pitch = log(pitch),
z_log_pitch = scale(log_pitch))
# define the model
m_loess_ads <- loess(z_log_pitch ~ time_utt_ms, span = frac_points,
data = d_pitch_ads)
# predict fitted values for each observation in the original dataset
df_preds_ads <- predict(m_loess_ads,
newdata = d_pitch_ads,
se = TRUE) %>%
data.frame() %>%
mutate(time = d_pitch_ads$time_utt_ms)
# calculate entropy
samp_ent <- df_preds_ads %>%
filter(!is.na(fit)) %>%
pull(fit) %>%
pracma::sample_entropy() %>%
round(2)
# plot
loess_preds_ads <- df_preds_ads %>%
ggplot(aes(x = time, y = fit)) +
geom_line(color = model_fit_color, size = 1) +
geom_point(color = point_color, size = 2) +
labs(x = "Time (ms)", y = "Z-score log(pitch)",
subtitle = paste('sample entropy = ', samp_ent)) +
lims(y = c(-2.5,2.5))
Plot IDS and ADS curves next to each other:
cowplot::plot_grid(loess_preds_ids, loess_preds_ads,
labels = c('(IDS)', '(ADS)'),
scale = 0.85)
Here we replicate the approach of using uniformly spaced segments where F0 was sampled and predicted at regular intervals. Following, Rasanen et al., 2018 we use 100-ms non-overlapping segments.
Voicing information was used to determine the first and last valid segment of each utterance.
Add 100 ms time bins to pitch data frame.
df_preds_ads %<>%
mutate(time_bin = cut_width(time, width = 100, boundary = 0))
Plot the 100 ms time bins.
df_preds_ads %>%
ggplot(aes(x = time, y = fit, color = time_bin)) +
geom_line(size = 1) +
guides(color = F)
p <- pracma::polyfit(x, y, 2)